import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import missingno as msno
import plotly.express as px
sns.set_style('whitegrid')
pd.set_option('display.max_columns', None)
import warnings
warnings.simplefilter('ignore')
# Load the raw survey responses into a DataFrame.
# NOTE(review): the path is an absolute, machine-specific Windows location —
# it has to be adjusted before the notebook can run on another machine.
df = pd.read_csv(r'D:\Datasets\survey.csv')
# Preview the first rows of the frame.
df.head()
| Timestamp | Age | Gender | Country | state | self_employed | family_history | treatment | work_interfere | no_employees | remote_work | tech_company | benefits | care_options | wellness_program | seek_help | anonymity | leave | mental_health_consequence | phys_health_consequence | coworkers | supervisor | mental_health_interview | phys_health_interview | mental_vs_physical | obs_consequence | comments | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2014-08-27 11:29:31 | 37 | Female | United States | IL | NaN | No | Yes | Often | 6-25 | No | Yes | Yes | Not sure | No | Yes | Yes | Somewhat easy | No | No | Some of them | Yes | No | Maybe | Yes | No | NaN |
| 1 | 2014-08-27 11:29:37 | 44 | M | United States | IN | NaN | No | No | Rarely | More than 1000 | No | No | Don't know | No | Don't know | Don't know | Don't know | Don't know | Maybe | No | No | No | No | No | Don't know | No | NaN |
| 2 | 2014-08-27 11:29:44 | 32 | Male | Canada | NaN | NaN | No | No | Rarely | 6-25 | No | Yes | No | No | No | No | Don't know | Somewhat difficult | No | No | Yes | Yes | Yes | Yes | No | No | NaN |
| 3 | 2014-08-27 11:29:46 | 31 | Male | United Kingdom | NaN | NaN | Yes | Yes | Often | 26-100 | No | Yes | No | Yes | No | No | No | Somewhat difficult | Yes | Yes | Some of them | No | Maybe | Maybe | No | Yes | NaN |
| 4 | 2014-08-27 11:30:22 | 31 | Male | United States | TX | NaN | No | No | Never | 100-500 | Yes | Yes | Yes | No | Don't know | Don't know | Don't know | Don't know | No | No | Some of them | Yes | Yes | Yes | Don't know | No | NaN |
df.shape
(1259, 27)
df.tail()
| Timestamp | Age | Gender | Country | state | self_employed | family_history | treatment | work_interfere | no_employees | remote_work | tech_company | benefits | care_options | wellness_program | seek_help | anonymity | leave | mental_health_consequence | phys_health_consequence | coworkers | supervisor | mental_health_interview | phys_health_interview | mental_vs_physical | obs_consequence | comments | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1254 | 2015-09-12 11:17:21 | 26 | male | United Kingdom | NaN | No | No | Yes | NaN | 26-100 | No | Yes | No | No | No | No | Don't know | Somewhat easy | No | No | Some of them | Some of them | No | No | Don't know | No | NaN |
| 1255 | 2015-09-26 01:07:35 | 32 | Male | United States | IL | No | Yes | Yes | Often | 26-100 | Yes | Yes | Yes | Yes | No | No | Yes | Somewhat difficult | No | No | Some of them | Yes | No | No | Yes | No | NaN |
| 1256 | 2015-11-07 12:36:58 | 34 | male | United States | CA | No | Yes | Yes | Sometimes | More than 1000 | No | Yes | Yes | Yes | No | No | Don't know | Somewhat difficult | Yes | Yes | No | No | No | No | No | No | NaN |
| 1257 | 2015-11-30 21:25:06 | 46 | f | United States | NC | No | No | No | NaN | 100-500 | Yes | Yes | No | Yes | No | No | Don't know | Don't know | Yes | No | No | No | No | No | No | No | NaN |
| 1258 | 2016-02-01 23:04:31 | 25 | Male | United States | IL | No | Yes | Yes | Sometimes | 26-100 | No | No | Yes | Yes | No | No | Yes | Don't know | Maybe | No | Some of them | No | No | No | Don't know | No | NaN |
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1259 entries, 0 to 1258 Data columns (total 27 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Timestamp 1259 non-null object 1 Age 1259 non-null int64 2 Gender 1259 non-null object 3 Country 1259 non-null object 4 state 744 non-null object 5 self_employed 1241 non-null object 6 family_history 1259 non-null object 7 treatment 1259 non-null object 8 work_interfere 995 non-null object 9 no_employees 1259 non-null object 10 remote_work 1259 non-null object 11 tech_company 1259 non-null object 12 benefits 1259 non-null object 13 care_options 1259 non-null object 14 wellness_program 1259 non-null object 15 seek_help 1259 non-null object 16 anonymity 1259 non-null object 17 leave 1259 non-null object 18 mental_health_consequence 1259 non-null object 19 phys_health_consequence 1259 non-null object 20 coworkers 1259 non-null object 21 supervisor 1259 non-null object 22 mental_health_interview 1259 non-null object 23 phys_health_interview 1259 non-null object 24 mental_vs_physical 1259 non-null object 25 obs_consequence 1259 non-null object 26 comments 164 non-null object dtypes: int64(1), object(26) memory usage: 265.7+ KB
# Use lower-case column labels throughout the rest of the analysis.
df.columns = df.columns.map(str.lower)
# Share of missing values per column, expressed in percent with two decimals.
df.isna().sum().div(df.shape[0]).mul(100).round(2)
timestamp 0.00 age 0.00 gender 0.00 country 0.00 state 40.91 self_employed 1.43 family_history 0.00 treatment 0.00 work_interfere 20.97 no_employees 0.00 remote_work 0.00 tech_company 0.00 benefits 0.00 care_options 0.00 wellness_program 0.00 seek_help 0.00 anonymity 0.00 leave 0.00 mental_health_consequence 0.00 phys_health_consequence 0.00 coworkers 0.00 supervisor 0.00 mental_health_interview 0.00 phys_health_interview 0.00 mental_vs_physical 0.00 obs_consequence 0.00 comments 86.97 dtype: float64
df['country'].value_counts()
United States 751 United Kingdom 185 Canada 72 Germany 45 Ireland 27 Netherlands 27 Australia 21 France 13 India 10 New Zealand 8 Poland 7 Switzerland 7 Sweden 7 Italy 7 South Africa 6 Belgium 6 Brazil 6 Israel 5 Singapore 4 Bulgaria 4 Austria 3 Finland 3 Mexico 3 Russia 3 Denmark 2 Greece 2 Colombia 2 Croatia 2 Portugal 2 Moldova 1 Georgia 1 Bahamas, The 1 China 1 Thailand 1 Czech Republic 1 Norway 1 Romania 1 Nigeria 1 Japan 1 Hungary 1 Bosnia and Herzegovina 1 Uruguay 1 Spain 1 Zimbabwe 1 Latvia 1 Costa Rica 1 Slovenia 1 Philippines 1 Name: country, dtype: int64
df['state'].unique()
array(['IL', 'IN', nan, 'TX', 'TN', 'MI', 'OH', 'CA', 'CT', 'MD', 'NY',
'NC', 'MA', 'IA', 'PA', 'WA', 'WI', 'UT', 'NM', 'OR', 'FL', 'MN',
'MO', 'AZ', 'CO', 'GA', 'DC', 'NE', 'WV', 'OK', 'KS', 'VA', 'NH',
'KY', 'AL', 'NV', 'NJ', 'SC', 'VT', 'SD', 'ID', 'MS', 'RI', 'WY',
'LA', 'ME'], dtype=object)
# Remove the four columns that are not carried forward into the analysis.
columns_to_drop = ['country', 'state', 'timestamp', 'comments']
df = df.drop(columns=columns_to_drop)
# Summary statistics for every column (numeric and non-numeric), rounded to two decimals.
df.describe(include='all').round(2)
| age | gender | self_employed | family_history | treatment | work_interfere | no_employees | remote_work | tech_company | benefits | care_options | wellness_program | seek_help | anonymity | leave | mental_health_consequence | phys_health_consequence | coworkers | supervisor | mental_health_interview | phys_health_interview | mental_vs_physical | obs_consequence | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1.259000e+03 | 1259 | 1241 | 1259 | 1259 | 995 | 1259 | 1259 | 1259 | 1259 | 1259 | 1259 | 1259 | 1259 | 1259 | 1259 | 1259 | 1259 | 1259 | 1259 | 1259 | 1259 | 1259 |
| unique | NaN | 49 | 2 | 2 | 2 | 4 | 6 | 2 | 2 | 3 | 3 | 3 | 3 | 3 | 5 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 2 |
| top | NaN | Male | No | No | Yes | Sometimes | 6-25 | No | Yes | Yes | No | No | No | Don't know | Don't know | No | No | Some of them | Yes | No | Maybe | Don't know | No |
| freq | NaN | 615 | 1095 | 767 | 637 | 465 | 290 | 883 | 1031 | 477 | 501 | 842 | 646 | 819 | 563 | 490 | 925 | 774 | 516 | 1008 | 557 | 576 | 1075 |
| mean | 7.942815e+07 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| std | 2.818299e+09 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| min | -1.726000e+03 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 25% | 2.700000e+01 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 50% | 3.100000e+01 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 75% | 3.600000e+01 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| max | 1.000000e+11 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
df['gender'].value_counts().reset_index()
| index | gender | |
|---|---|---|
| 0 | Male | 615 |
| 1 | male | 206 |
| 2 | Female | 121 |
| 3 | M | 116 |
| 4 | female | 62 |
| 5 | F | 38 |
| 6 | m | 34 |
| 7 | f | 15 |
| 8 | Make | 4 |
| 9 | Male | 3 |
| 10 | Woman | 3 |
| 11 | Cis Male | 2 |
| 12 | Man | 2 |
| 13 | Female (trans) | 2 |
| 14 | Female | 2 |
| 15 | Trans woman | 1 |
| 16 | msle | 1 |
| 17 | male leaning androgynous | 1 |
| 18 | Neuter | 1 |
| 19 | cis male | 1 |
| 20 | queer | 1 |
| 21 | Female (cis) | 1 |
| 22 | 1 | |
| 23 | cis-female/femme | 1 |
| 24 | A little about you | 1 |
| 25 | Malr | 1 |
| 26 | p | 1 |
| 27 | femail | 1 |
| 28 | Cis Man | 1 |
| 29 | Guy (-ish) ^_^ | 1 |
| 30 | Enby | 1 |
| 31 | Agender | 1 |
| 32 | Androgyne | 1 |
| 33 | Male-ish | 1 |
| 34 | maile | 1 |
| 35 | Trans-female | 1 |
| 36 | Cis Female | 1 |
| 37 | something kinda male? | 1 |
| 38 | Mal | 1 |
| 39 | Male (CIS) | 1 |
| 40 | queer/she/they | 1 |
| 41 | non-binary | 1 |
| 42 | Femake | 1 |
| 43 | woman | 1 |
| 44 | Nah | 1 |
| 45 | All | 1 |
| 46 | fluid | 1 |
| 47 | Genderqueer | 1 |
| 48 | ostensibly male, unsure what that really means | 1 |
# Collapse the free-text gender answers into three buckets: 'Male', 'Female', 'Other'.
# Each replacement is assigned back to the column. Calling
# ``df['gender'].replace(..., inplace=True)`` acts on a selected column object and is
# not guaranteed to modify ``df`` itself (it does not under pandas copy-on-write).
gender_groups = {
    'Male': ['Male ', 'male', 'M', 'm', 'Cis Male',
             'Man', 'cis male', 'Mail', 'Male-ish', 'Male (CIS)',
             'Cis Man', 'msle', 'Malr', 'Mal', 'maile', 'Make'],
    'Female': ['Female ', 'female', 'F', 'f', 'Woman',
               'femail', 'Cis Female', 'cis-female/femme', 'Femake', 'Female (cis)',
               'woman'],
    'Other': ['Female (trans)', 'queer/she/they', 'non-binary',
              'fluid', 'queer', 'Androgyne', 'Trans-female', 'male leaning androgynous',
              'Agender', 'A little about you', 'Nah', 'All',
              'ostensibly male, unsure what that really means',
              'Genderqueer', 'Enby', 'p', 'Neuter', 'something kinda male?',
              'Guy (-ish) ^_^', 'Trans woman'],
}
# Answers already spelled exactly 'Male' / 'Female' need no replacement.
for canonical_label, variants in gender_groups.items():
    df['gender'] = df['gender'].replace(variants, canonical_label)
df['gender'].value_counts()
Male 991 Female 247 Other 21 Name: gender, dtype: int64
# Overwrite implausible ages in place: every value below 12 is set to 15 ...
# NOTE(review): the threshold (12) and the replacement value (15) differ — confirm this is intended.
df.loc[df.age<12, 'age'] =15
# ... and every value above 75 is capped at 75.
df.loc[df.age>75, 'age'] = 75
# Check: after the cap above, no row can have an age greater than 80.
df[df['age'] > 80].head()
| age | gender | self_employed | family_history | treatment | work_interfere | no_employees | remote_work | tech_company | benefits | care_options | wellness_program | seek_help | anonymity | leave | mental_health_consequence | phys_health_consequence | coworkers | supervisor | mental_health_interview | phys_health_interview | mental_vs_physical | obs_consequence |
|---|
df['age'].hist()
<AxesSubplot:>
msno.matrix(df)
<AxesSubplot:>
from sklearn.model_selection import train_test_split
train_data, test_data = train_test_split(df,test_size=0.15,stratify=df['treatment'],random_state=42 )
print(f'Train Data Dimensions: {train_data.shape}\n Test data dimensions:{test_data.shape}')
Train Data Dimensions: (1070, 23) Test data dimensions:(189, 23)
health = train_data.copy()
health.columns
Index(['age', 'gender', 'self_employed', 'family_history', 'treatment',
'work_interfere', 'no_employees', 'remote_work', 'tech_company',
'benefits', 'care_options', 'wellness_program', 'seek_help',
'anonymity', 'leave', 'mental_health_consequence',
'phys_health_consequence', 'coworkers', 'supervisor',
'mental_health_interview', 'phys_health_interview',
'mental_vs_physical', 'obs_consequence'],
dtype='object')
# Distribution of the target variable in the training split.
# Plotly Express builds its own figure; a matplotlib ``plt.figure()`` call does not
# affect it and would only emit an empty matplotlib canvas, so none is created here.
px.histogram(health, x ='treatment', color ='treatment')
<Figure size 576x576 with 0 Axes>
px.histogram(health, x ='age', color = 'treatment')
# Need to remove entries with age < 12, as we perceive them to be false entries
px.histogram(health, x ='self_employed', color = 'treatment', barmode = 'group')
sns.countplot(data =health, x ='work_interfere')
<AxesSubplot:xlabel='work_interfere', ylabel='count'>
health['work_interfere'].value_counts(normalize = True)
Sometimes 0.468009 Never 0.210900 Rarely 0.174171 Often 0.146919 Name: work_interfere, dtype: float64
px.histogram(health, x='work_interfere', color ='treatment', barmode ='group')
# Family history of mental illness
px.histogram(health, x= 'family_history', color ='treatment', barmode = 'group')
#remote work hours
sns.countplot(data = health, x ='remote_work')
<AxesSubplot:xlabel='remote_work', ylabel='count'>
px.histogram(health, x ='remote_work', color ='treatment',barmode ='group')
sns.countplot(data=health, x ='benefits')
<AxesSubplot:xlabel='benefits', ylabel='count'>
health['benefits'].value_counts(normalize = True)
Yes 0.377570 Don't know 0.324299 No 0.298131 Name: benefits, dtype: float64
px.histogram(health, x = 'benefits',color='treatment',barmode='group')
sns.countplot(data = health , x = 'care_options')
<AxesSubplot:xlabel='care_options', ylabel='count'>
health['care_options'].value_counts(normalize=True)
No 0.394393 Yes 0.357009 Not sure 0.248598 Name: care_options, dtype: float64
px.histogram(health, x = 'care_options',color='treatment',barmode='group')
#how many employees does your company have?
sns.countplot(data = health , x = 'no_employees')
<AxesSubplot:xlabel='no_employees', ylabel='count'>
px.histogram(health, x = 'no_employees',color='benefits',barmode='group')
px.histogram(health, x = 'no_employees',color='treatment',barmode='group')
# Has your employer ever discussed mental health as part of an employee wellness program?
sns.countplot(data = health , x = 'wellness_program')
<AxesSubplot:xlabel='wellness_program', ylabel='count'>
px.histogram(health, x = 'wellness_program',color='treatment',barmode='group')
# Does your employer provide resources to learn more about mental health issues and how to seek help?
sns.countplot(data = health , x = 'seek_help')
<AxesSubplot:xlabel='seek_help', ylabel='count'>
px.histogram(health, x = 'seek_help',color='treatment',barmode='group')
# Is your anonymity protected if you choose to take advantage of mental health or substance abuse treatment resources?
sns.countplot(data = health , x = 'anonymity')
<AxesSubplot:xlabel='anonymity', ylabel='count'>
px.histogram(health, x = 'anonymity',color='treatment',barmode='group')
# How easy is it for you to take medical leave for a mental health condition?
sns.countplot(data = health , x = 'leave')
plt.xticks(rotation=45)
plt.show()
px.histogram(health, x = 'leave',color='treatment',barmode='group')
# Do you think that discussing a mental health issue with your employer would have negative consequences?
sns.countplot(data = health , x = 'mental_health_consequence' )
<AxesSubplot:xlabel='mental_health_consequence', ylabel='count'>
px.histogram(health, x = 'mental_health_consequence',color='treatment',barmode='group')
sns.countplot(data = health , x = 'coworkers' )
<AxesSubplot:xlabel='coworkers', ylabel='count'>
px.histogram(health, x = 'coworkers',color='treatment',barmode='group')
# Do you feel that your employer takes mental health as seriously as physical health?
sns.countplot(data = health , x = 'mental_vs_physical' )
<AxesSubplot:xlabel='mental_vs_physical', ylabel='count'>
px.histogram(health, x = 'mental_vs_physical',color='treatment',barmode='group')
# Have you heard of or observed negative consequences for coworkers with mental health conditions in your workplace?
sns.countplot(data = health , x = 'obs_consequence')
<AxesSubplot:xlabel='obs_consequence', ylabel='count'>
To start, we need to treat the missing values in the columns 'work_interfere' and 'self_employed'.
# Impute missing 'self_employed' answers with the most frequent value of the column.
se_mode = train_data['self_employed'].mode().values[0]
# Assign the filled column back: ``fillna(..., inplace=True)`` on a selected column
# is a chained in-place operation and is not guaranteed to update ``train_data``
# (it does not under pandas copy-on-write).
train_data['self_employed'] = train_data['self_employed'].fillna(se_mode)
train_data[train_data['work_interfere'].isna()]['treatment'].value_counts()
No 223 Yes 3 Name: treatment, dtype: int64
About 99% of the people who did not answer the question of whether their mental health interferes with work have not sought treatment. If we treat the null values as a category, 'Never' is the closest match among the existing categories of the 'work_interfere' column.
# Fill unanswered 'work_interfere' entries with the 'Never' category.
# The filled column is assigned back because ``fillna(..., inplace=True)`` on a
# selected column is a chained in-place operation that is not guaranteed to
# update ``train_data`` (it does not under pandas copy-on-write).
train_data['work_interfere'] = train_data['work_interfere'].fillna('Never')
msno.bar(train_data, color = (0.5,0.2,0.35))
<AxesSubplot:>
As there are no missing values we proceed to the next step.
# Predictors: every column except the target.
X_train = train_data.drop(columns='treatment')
# Target: an independent copy of the 'treatment' column.
Y_train = train_data.loc[:, 'treatment'].copy()
All columns are categorical, except age.
train_data.columns
Index(['age', 'gender', 'self_employed', 'family_history', 'treatment',
'work_interfere', 'no_employees', 'remote_work', 'tech_company',
'benefits', 'care_options', 'wellness_program', 'seek_help',
'anonymity', 'leave', 'mental_health_consequence',
'phys_health_consequence', 'coworkers', 'supervisor',
'mental_health_interview', 'phys_health_interview',
'mental_vs_physical', 'obs_consequence'],
dtype='object')
# Ordered category levels for each categorical feature. The position of a level in
# its list is the integer code the ordinal encoder will assign to it.
# Shared answer scales are defined once and copied, so every feature still owns
# an independent list.
_NO_YES = ('No', 'Yes')
_NO_DONT_KNOW_YES = ('No', "Don't know", 'Yes')
_NO_MAYBE_YES = ('No', 'Maybe', 'Yes')
_NO_SOME_YES = ('No', 'Some of them', 'Yes')

gender_cols = ['Female', 'Male', 'Other']
self_employed_cols = list(_NO_YES)
family_history_cols = list(_NO_YES)
work_interfere_cols = ['Never', 'Rarely', 'Sometimes', 'Often']
no_employees_cols = ['1-5', '6-25', '26-100', '100-500', '500-1000', 'More than 1000']
remote_work_cols = list(_NO_YES)
tech_company_cols = list(_NO_YES)
benefits_cols = list(_NO_DONT_KNOW_YES)
care_options_cols = ['No', 'Not sure', 'Yes']
wellness_program_cols = list(_NO_DONT_KNOW_YES)
seek_help_cols = list(_NO_DONT_KNOW_YES)
anonymity_cols = list(_NO_DONT_KNOW_YES)
leave_cols = ['Very easy', 'Somewhat easy', "Don't know", 'Somewhat difficult', 'Very difficult']
mental_health_consequence_cols = list(_NO_MAYBE_YES)
phys_health_consequence_cols = list(_NO_MAYBE_YES)
coworkers_col = list(_NO_SOME_YES)
supervisor_cols = list(_NO_SOME_YES)
mental_health_interview_cols = list(_NO_MAYBE_YES)
phys_health_interview_cols = list(_NO_MAYBE_YES)
# NOTE(review): unlike the other "Don't know" scales, this one places "Don't know"
# first rather than between 'No' and 'Yes' — confirm the ordering is intended.
mental_vs_physical_cols = ["Don't know", 'No', 'Yes']
obs_consequence_cols = list(_NO_YES)

# One entry per categorical feature; the encoder matches entries to columns by position.
columns_for_encoder = [
    gender_cols,
    self_employed_cols,
    family_history_cols,
    work_interfere_cols,
    no_employees_cols,
    remote_work_cols,
    tech_company_cols,
    benefits_cols,
    care_options_cols,
    wellness_program_cols,
    seek_help_cols,
    anonymity_cols,
    leave_cols,
    mental_health_consequence_cols,
    phys_health_consequence_cols,
    coworkers_col,
    supervisor_cols,
    mental_health_interview_cols,
    phys_health_interview_cols,
    mental_vs_physical_cols,
    obs_consequence_cols,
]
from sklearn.preprocessing import OrdinalEncoder

# All predictor names, in frame order; everything after the first column is
# passed through the ordinal encoder.
features = X_train.columns.tolist()
categorical_features = features[1:]
# The encoder receives the explicit level ordering for each encoded column.
ord_encoder = OrdinalEncoder(categories=list(columns_for_encoder))
encoded_values = ord_encoder.fit_transform(X_train[categorical_features])
X_train[categorical_features] = encoded_values
X_train.head()
| age | gender | self_employed | family_history | work_interfere | no_employees | remote_work | tech_company | benefits | care_options | wellness_program | seek_help | anonymity | leave | mental_health_consequence | phys_health_consequence | coworkers | supervisor | mental_health_interview | phys_health_interview | mental_vs_physical | obs_consequence | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 676 | 22 | 1.0 | 0.0 | 0.0 | 0.0 | 2.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 2.0 | 1.0 | 1.0 | 1.0 | 2.0 | 1.0 | 1.0 | 0.0 | 0.0 |
| 88 | 29 | 0.0 | 0.0 | 0.0 | 2.0 | 2.0 | 0.0 | 1.0 | 2.0 | 2.0 | 0.0 | 0.0 | 1.0 | 2.0 | 1.0 | 0.0 | 1.0 | 1.0 | 0.0 | 1.0 | 1.0 | 1.0 |
| 86 | 39 | 1.0 | 1.0 | 0.0 | 3.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 4.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 1.0 | 1.0 |
| 1210 | 24 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 2.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 505 | 46 | 1.0 | 0.0 | 0.0 | 2.0 | 5.0 | 0.0 | 1.0 | 2.0 | 2.0 | 2.0 | 2.0 | 1.0 | 2.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 2.0 | 0.0 |
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training predictors and write the scaled values back
# into the same columns.
std_scaler = StandardScaler()
scaled_values = std_scaler.fit_transform(X_train)
X_train[features] = scaled_values
X_train
| age | gender | self_employed | family_history | work_interfere | no_employees | remote_work | tech_company | benefits | care_options | wellness_program | seek_help | anonymity | leave | mental_health_consequence | phys_health_consequence | coworkers | supervisor | mental_health_interview | phys_health_interview | mental_vs_physical | obs_consequence | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 676 | -1.363472 | 0.419109 | -0.362047 | -0.803837 | -1.135675 | -0.243796 | -0.659027 | 0.480937 | -1.319345 | -1.111531 | -0.650242 | -0.869401 | -0.461953 | 0.267568 | 0.223297 | 1.212902 | 0.040848 | 1.062155 | 1.546143 | 0.357235 | -0.963586 | -0.406912 |
| 88 | -0.409844 | -1.904448 | -0.362047 | -0.803837 | 0.720967 | -0.243796 | -0.659027 | 0.480937 | 1.125156 | 1.197863 | -0.650242 | -0.869401 | -0.461953 | 0.267568 | 0.223297 | -0.569797 | 0.040848 | -0.124177 | -0.461591 | 0.357235 | 0.233902 | 2.457534 |
| 86 | 0.952483 | 0.419109 | 2.762070 | -0.803837 | 1.649288 | -0.831323 | -0.659027 | 0.480937 | -1.319345 | -1.111531 | -0.650242 | -0.869401 | -0.461953 | 2.108713 | 0.223297 | -0.569797 | 0.040848 | -1.310508 | -0.461591 | 0.357235 | 0.233902 | 2.457534 |
| 1210 | -1.091007 | 0.419109 | -0.362047 | 1.244033 | -1.135675 | -1.418850 | -0.659027 | 0.480937 | -1.319345 | -1.111531 | -0.650242 | -0.869401 | -0.461953 | 0.267568 | 0.223297 | 1.212902 | -1.577948 | -1.310508 | -0.461591 | -1.058475 | -0.963586 | -0.406912 |
| 505 | 1.906111 | 0.419109 | -0.362047 | -0.803837 | 0.720967 | 1.518785 | -0.659027 | 0.480937 | 1.125156 | 1.197863 | 1.893665 | 1.700376 | -0.461953 | 0.267568 | 0.223297 | -0.569797 | 0.040848 | -1.310508 | -0.461591 | -1.058475 | 1.431390 | -0.406912 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 988 | -1.227240 | 0.419109 | -0.362047 | 1.244033 | 0.720967 | -0.243796 | -0.659027 | 0.480937 | 1.125156 | 0.043166 | 1.893665 | 1.700376 | 1.396280 | -0.653004 | -1.082318 | -0.569797 | -1.577948 | -1.310508 | -0.461591 | 0.357235 | -0.963586 | -0.406912 |
| 598 | 1.497413 | 0.419109 | -0.362047 | -0.803837 | -1.135675 | 0.931258 | 1.517389 | 0.480937 | -0.097095 | 0.043166 | 0.621712 | 0.415487 | -0.461953 | 0.267568 | -1.082318 | -0.569797 | 1.659645 | 1.062155 | -0.461591 | -1.058475 | 1.431390 | -0.406912 |
| 363 | -0.001146 | 0.419109 | -0.362047 | 1.244033 | -0.207354 | 1.518785 | -0.659027 | -2.079275 | 1.125156 | 1.197863 | 1.893665 | 1.700376 | 1.396280 | 0.267568 | -1.082318 | -0.569797 | 0.040848 | 1.062155 | -0.461591 | 1.772946 | 1.431390 | -0.406912 |
| 690 | -1.227240 | 0.419109 | -0.362047 | -0.803837 | 0.720967 | -0.831323 | -0.659027 | 0.480937 | 1.125156 | -1.111531 | -0.650242 | -0.869401 | -0.461953 | -0.653004 | 0.223297 | 1.212902 | 0.040848 | -0.124177 | -0.461591 | -1.058475 | -0.963586 | -0.406912 |
| 191 | -0.546076 | 0.419109 | -0.362047 | 1.244033 | 0.720967 | -0.243796 | -0.659027 | 0.480937 | 1.125156 | -1.111531 | -0.650242 | -0.869401 | -0.461953 | -0.653004 | 0.223297 | 1.212902 | 0.040848 | -1.310508 | -0.461591 | 0.357235 | 0.233902 | -0.406912 |
1070 rows × 22 columns
from sklearn.preprocessing import LabelEncoder
lb_encoder = LabelEncoder()
Y_train = lb_encoder.fit_transform(Y_train)
Data Preprocessing part ends here. Further, we move to Model Building.
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score
def train_evaluate(model, X_train, Y_train, name, cv=10):
    """Fit ``model`` and report its F1 score on the training data and under cross-validation.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit`` and ``predict``. It is fitted in place on the
        full ``X_train`` / ``Y_train``.
    X_train : array-like
        Training features.
    Y_train : array-like
        Training labels, scored with ``f1_score`` using its default settings.
    name : str
        Label stored in the ``Name`` column of the result.
    cv : int, default 10
        Cross-validation setting forwarded to ``cross_val_score``.

    Returns
    -------
    pandas.DataFrame
        A single row with the columns ``Name``, ``F1_score_trainset`` and
        ``F1_score_validationset`` (mean of the cross-validation scores).
    """
    # Score on the data the model was trained on.
    model.fit(X_train, Y_train)
    y_pred = model.predict(X_train)
    f1_train = f1_score(Y_train, y_pred)

    # Cross-validated F1 scores, one per fold.
    f1_val = cross_val_score(model, X_train, Y_train, scoring='f1', cv=cv)

    # Collect both scores in a one-row frame.
    score = pd.DataFrame({
        'Name': [name],
        'F1_score_trainset': [f1_train],
        'F1_score_validationset': [f1_val.mean()],
    })
    return score
from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression(penalty='l1',solver='liblinear')
train_evaluate(log_reg,X_train,Y_train,'Logistic Regression')
| Name | F1_score_trainset | F1_score_validationset | |
|---|---|---|---|
| 0 | Logistic Regression | 0.826521 | 0.817808 |
from sklearn.tree import DecisionTreeClassifier
dt_clf = DecisionTreeClassifier(max_leaf_nodes=4,random_state=42)
train_evaluate(dt_clf,X_train,Y_train,'DecisionTreeClassifier')
| Name | F1_score_trainset | F1_score_validationset | |
|---|---|---|---|
| 0 | DecisionTreeClassifier | 0.851698 | 0.851769 |
from sklearn.svm import SVC
svc_clf = SVC()
train_evaluate(svc_clf, X_train, Y_train, 'Support Vector Classifier')
| Name | F1_score_trainset | F1_score_validationset | |
|---|---|---|---|
| 0 | Support Vector Classifier | 0.885159 | 0.834448 |
from sklearn.ensemble import RandomForestClassifier
rnd_clf = RandomForestClassifier(random_state=42)
train_evaluate(rnd_clf, X_train,Y_train, 'Random Forest Classifier')
| Name | F1_score_trainset | F1_score_validationset | |
|---|---|---|---|
| 0 | Random Forest Classifier | 1.0 | 0.842216 |
from sklearn.ensemble import AdaBoostClassifier

# Decision tree with default settings, used as the weak learner for boosting.
dt_clf_ada = DecisionTreeClassifier()
# scikit-learn 1.2 renamed the ``base_estimator`` argument to ``estimator`` and
# 1.4 removed the old name. Prefer the new keyword and fall back to the old one
# on releases that do not know it yet.
try:
    Ada_clf = AdaBoostClassifier(estimator=dt_clf_ada, random_state=42)
except TypeError:
    Ada_clf = AdaBoostClassifier(base_estimator=dt_clf_ada, random_state=42)
train_evaluate(Ada_clf,X_train,Y_train, "Ada Boost Classifier")
| Name | F1_score_trainset | F1_score_validationset | |
|---|---|---|---|
| 0 | Ada Boost Classifier | 1.0 | 0.767202 |
from sklearn.ensemble import GradientBoostingClassifier
gdb_clf = GradientBoostingClassifier(random_state=42,subsample=0.8)
train_evaluate(gdb_clf,X_train,Y_train, "Gradient Boosting Classifier")
| Name | F1_score_trainset | F1_score_validationset | |
|---|---|---|---|
| 0 | Gradient Boosting Classifier | 0.897596 | 0.850009 |
from xgboost import XGBClassifier
xbg_clf = XGBClassifier(verbosity=0)
train_evaluate(xbg_clf,X_train,Y_train, "XGBoost Classifier")
| Name | F1_score_trainset | F1_score_validationset | |
|---|---|---|---|
| 0 | XGBoost Classifier | 1.0 | 0.828707 |
Among the models, we narrow down the most promising models to fine tune the parameters further
from sklearn.model_selection import GridSearchCV
# Hyper-parameter grid for the support vector classifier.
# - The polynomial kernel is spelled 'poly' in scikit-learn; 'polynomial' is not a
#   valid kernel name, so candidates using it could never produce a score.
# - Each C value is listed once; a duplicated entry only repeats identical fits.
param_distribs = {
    'kernel': ['linear', 'rbf', 'poly'],
    'C': [0.01, 0.1, 0.15, 0.2, 0.25, 0.5, 0.75, 1, 2, 10, 100],
    'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
}
svm_clf = SVC()
# Exhaustive search over the grid, scored by F1 with 5-fold cross-validation.
grid_cv = GridSearchCV(
    svm_clf,
    param_grid=param_distribs,
    cv=5,
    scoring='f1',
    verbose=1,
)
grid_cv.fit(X_train, Y_train)
grid_cv.best_estimator_
SVC(C=0.5, gamma=0.01)
# Evaluate the best estimator found by the grid search. The encoded training
# labels are stored in ``Y_train`` (capital Y); ``y_train`` is not defined.
train_evaluate(grid_cv.best_estimator_, X_train, Y_train, "SVC Tuned")
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-133-7cf7c05c6769> in <module> ----> 1 train_evaluate(grid_cv.best_estimator_,X_train,y_train,"SVC Tuned") NameError: name 'y_train' is not defined